import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
# Dark-theme palette shared by the layout and by every figure in the dashboard.
marvel_colors = dict(
    background='#0F0F23',
    surface='#1A1A2E',
    primary='#FF6B6B',
    secondary='#4ECDC4',
    accent='#FFD93D',
    text='#FFFFFF',
    text_secondary='#B0B0B0',
)
# Data Loading and Exploration
def load_and_explore_data(file_path):
    """
    Load a CSV file into a DataFrame and print a short exploratory summary.

    The summary consists of the shape, the ``DataFrame.info()`` report, the
    number of missing values per column and ``describe(include='all')``.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when loading or summarising fails
        (the error is printed instead of being raised).
    """
    try:
        # Read the file the caller asked for rather than a hard-coded name.
        df = pd.read_csv(file_path)
        print(f'Shape: {df.shape}')
        print('\nInfo:')
        # info() writes its own report and returns None, so wrapping it in
        # print() would emit an extra "None" line.
        df.info()
        print(f'\nMissing values per column:\n{df.isna().sum()}')
        print('\nBasic statistics:')
        print(df.describe(include='all'))
        return df
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal the
        # failure to the caller through the None return value.
        print(f"❌ Error loading data: {e}")
        return None
# Cleaning and Preprocessing
def clean_percentage_columns(df):
    """
    Convert percentage-formatted columns to numeric values.

    For each known percentage column that exists in ``df`` the values are cast
    to text, '%' signs and thousands commas are stripped, and the result is
    parsed as a number; anything unparseable becomes NaN. The frame is
    modified in place and also returned.
    """
    targets = (
        '% budget recovered',
        'critics % score',
        'audience % score',
        'audience vs critics % deviance',
        '1st vs 2nd weekend drop off',
        '% budget opening weekend',
    )
    for name in targets:
        if name not in df.columns:
            continue
        as_text = df[name].astype(str)
        without_percent = as_text.str.replace('%', '', regex=False)
        digits_only = without_percent.str.replace(',', '', regex=False)
        df[name] = pd.to_numeric(digits_only, errors='coerce')
    return df
def handle_missing_values(df):
    """
    Fill gaps in numeric columns with the column median, then drop rows that
    still lack an essential value.

    Every numeric column is median-imputed (NaNs are ignored when the median
    is computed) and stored as float64. A column holding no values at all has
    no median and is left untouched instead of raising. Essential columns
    that are absent from ``df`` are skipped rather than causing a KeyError.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame object after imputation and row removal.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        medians = df[numeric_cols].median()
        # Cast to float64 so the column dtypes do not depend on whether a
        # particular column happened to contain missing values.
        df[numeric_cols] = df[numeric_cols].fillna(medians).astype('float64')

    essential = ['budget', 'domestic gross ($m)', 'international gross ($m)', 'worldwide gross', 'year']
    # Only require the essential columns that this dataset actually has.
    present = [col for col in essential if col in df.columns]
    if present:
        df.dropna(subset=present, inplace=True)
    return df
def remove_outliers_zscore(df, columns, threshold=3.0):
    """
    Drop rows whose value is a z-score outlier in any of the given columns.

    Columns are processed one after another on a copy of ``df``: for each
    column the population z-score (``ddof=0``) is computed on the rows that
    are still left, and only rows with ``|z| < threshold`` are kept. Rows
    whose value in a processed column is NaN are dropped as well, because
    their z-score cannot be compared with the threshold.

    A column is skipped when it is absent from the frame or when its standard
    deviation is zero or undefined (constant or empty column): such a column
    has no outliers, and dividing by that deviation would otherwise turn
    every z-score into NaN and discard all rows.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Names of the columns to screen.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        A filtered copy of ``df``.
    """
    df_out = df.copy()
    for col in columns:
        if col not in df_out.columns:
            continue
        spread = df_out[col].std(ddof=0)
        if pd.isna(spread) or spread == 0:
            # No variation means no outliers to remove in this column.
            continue
        col_zscore = (df_out[col] - df_out[col].mean()) / spread
        df_out = df_out[col_zscore.abs() < threshold]
    return df_out
# Feature Engineering
def create_basic_features(df, current_year=2025):
    """
    Derive the engineered columns used by the model and the dashboard.

    Works on a copy of ``df`` and adds:

    * ``domestic_to_international`` - domestic gross divided by international
      gross (a tiny epsilon avoids division by zero).
    * ``movie_age`` - ``current_year`` minus the release year.
    * ``category_grouped`` - ``category`` with 'Unique' mapped to 'Other'.
    * ``mcu_phase`` - release-year bucket: up to 2012 is 'Phase 1', 2013-2015
      'Phase 2', 2016-2019 'Phase 3', anything later 'Phase 4+'.
    * ``is_team_movie`` - True for the 'Avengers' and 'Guardians' categories.
    * ``is_sequel`` - True when the title contains a colon, apart from two
      listed exceptions.

    ``category_grouped`` and ``is_team_movie`` are only added when the frame
    has a ``category`` column.

    Args:
        df: DataFrame with at least the 'domestic gross ($m)',
            'international gross ($m)', 'year' and 'film' columns.
        current_year: Reference year for ``movie_age``.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    df = df.copy()

    # Domestic vs International ratio
    df['domestic_to_international'] = df['domestic gross ($m)'] / (df['international gross ($m)'] + 1e-6)

    # Movie age
    df['movie_age'] = current_year - df['year']

    has_category = 'category' in df.columns
    if has_category:
        # Group categories: map 'Unique' to 'Other'
        df['category_grouped'] = df['category'].replace({'Unique': 'Other'})

    # MCU phase based on year. The last bin is open-ended so that releases
    # after any fixed end year still land in 'Phase 4+' instead of NaN.
    df['mcu_phase'] = pd.cut(
        df['year'],
        bins=[0, 2012, 2015, 2019, np.inf],
        labels=['Phase 1', 'Phase 2', 'Phase 3', 'Phase 4+']
    )

    if has_category:
        # Team movies are the Avengers and Guardians ensembles.
        df['is_team_movie'] = df['category'].isin(['Avengers', 'Guardians'])

    # Any title with a colon is a sequel, except two exceptions. Titles are
    # cast to text first so a missing title yields False instead of raising.
    exceptions = ['Avengers: Age of Ultron', 'Spider-Man: Homecoming']
    has_colon = df['film'].astype(str).str.contains(':', regex=False)
    df['is_sequel'] = has_colon & ~df['film'].isin(exceptions)
    return df
def encode_and_select_features(df):
    """
    Build a numeric feature matrix and rank the features with a random forest.

    Steps:
      1. Standardize the known numeric columns that exist in ``df``.
      2. One-hot encode the known categorical columns that exist (the first
         level of each is dropped) and remove the original columns.
      3. Keep only numeric columns as the feature matrix ``X``.
      4. Fit a RandomForestRegressor on an 80% training split to predict the
         unscaled 'worldwide gross' and read its feature importances.

    The target column itself is excluded from the predictors of the
    importance model; leaving it in would let the forest predict the target
    from its own scaled copy and make the ranking meaningless. The returned
    ``X`` still contains that scaled column, exactly as before.

    Args:
        df: DataFrame that contains a 'worldwide gross' column plus any of the
            numeric / categorical feature columns listed below.

    Returns:
        Tuple ``(X, top_features, top_importances)``: the numeric feature
        DataFrame, the names of up to ten most important features, and their
        importances in the same order.
    """
    target_col = 'worldwide gross'
    df_feat = df.copy()

    # Identify numeric columns to scale (must exist)
    candidate_numeric = [
        'budget', 'domestic gross ($m)', 'international gross ($m)', 'worldwide gross',
        'opening weekend ($m)', 'second weekend ($m)', '% budget recovered',
        'critics % score', 'audience % score', 'audience vs critics % deviance',
        '1st vs 2nd weekend drop off', 'movie_age', 'domestic_to_international'
    ]
    numeric_cols = [c for c in candidate_numeric if c in df_feat.columns]

    # Scale numeric columns; skipped when none of them is present.
    if numeric_cols:
        scaler = StandardScaler()
        df_feat[numeric_cols] = scaler.fit_transform(df_feat[numeric_cols])

    # One-hot encode categorical columns
    candidate_categorical = ['category_grouped', 'source', 'mcu_phase', 'is_team_movie', 'is_sequel']
    categorical_cols = [c for c in candidate_categorical if c in df_feat.columns]
    if categorical_cols:
        # Use sparse_output=False for newer sklearn versions
        encoder = OneHotEncoder(sparse_output=False, drop='first')
        encoded = encoder.fit_transform(df_feat[categorical_cols])
        encoded_cols = encoder.get_feature_names_out(categorical_cols)
        df_encoded = pd.DataFrame(encoded, columns=encoded_cols, index=df_feat.index)
        df_feat = pd.concat([df_feat.reset_index(drop=True), df_encoded.reset_index(drop=True)], axis=1)
        df_feat.drop(columns=categorical_cols, inplace=True)

    # Prepare X and y for feature importance
    X = df_feat.select_dtypes(include=[np.number]).copy()
    # Positional array: X may have been re-indexed above while df keeps its
    # original labels, so rows are matched by position, not by index label.
    y = df[target_col].to_numpy()

    # Never let the model see the quantity it is asked to predict.
    predictors = X.drop(columns=[target_col], errors='ignore')

    X_train, _, y_train, _ = train_test_split(predictors, y, test_size=0.2, random_state=42)
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    importances = pd.Series(rf.feature_importances_, index=predictors.columns).sort_values(ascending=False)
    top = importances.head(10)
    top_features = top.index.tolist()
    top_importances = top.values.tolist()
    return X, top_features, top_importances
# Main Processing Pipeline
df = load_and_explore_data('Marvel-Movies.csv')
if df is None:
    # The loader reports failure by returning None; stop with a clear message
    # instead of failing later with an unrelated AttributeError.
    raise RuntimeError("Could not load 'Marvel-Movies.csv'; the dashboard cannot start without data.")
df = clean_percentage_columns(df)
df = handle_missing_values(df)
df = remove_outliers_zscore(df, ['budget', 'worldwide gross'], threshold=3.0)
df = create_basic_features(df)
# Generate features and get importances
df_features, top_features, top_importances = encode_and_select_features(df)
# Interactive Dash Dashboard
# suppress_callback_exceptions is enabled because the tab contents (and the
# components the second callback refers to) are created on demand by
# render_content and are not part of this initial layout.
app = dash.Dash(
    __name__,
    external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'],
    suppress_callback_exceptions=True,
)

# Page skeleton: title, tab bar, and an empty container that render_content
# fills according to the selected tab.
app.layout = html.Div(
    style={
        'backgroundColor': marvel_colors['background'],
        'color': marvel_colors['text'],
        'padding': '20px',
    },
    children=[
        html.H1('Marvel Movies Analysis Dashboard', style={'textAlign': 'center'}),
        dcc.Tabs(
            id='tabs',
            value='tab-visuals',
            children=[
                dcc.Tab(
                    label=tab_label,
                    value=tab_value,
                    style={
                        'backgroundColor': marvel_colors['surface'],
                        'color': marvel_colors['text'],
                    },
                )
                for tab_label, tab_value in (
                    ('Visualizations', 'tab-visuals'),
                    ('Insights', 'tab-insights'),
                )
            ],
        ),
        html.Div(id='tabs-content', style={'marginTop': '20px'}),
    ],
)
def _build_visuals_tab():
    """Build the filter controls and the four empty graphs of the Visualizations tab."""
    year_min, year_max = int(df['year'].min()), int(df['year'].max())
    budget_min, budget_max = float(df['budget'].min()), float(df['budget'].max())
    # Drop missing categories first: sorted() cannot order NaN against strings.
    categories = df['category_grouped'].dropna().unique()

    label_style = {'color': marvel_colors['text']}
    control_style = {'width': '30%', 'marginBottom': '20px'}

    year_control = html.Div([
        html.Label('Year Range', style=label_style),
        dcc.RangeSlider(
            id='year-slider',
            min=year_min,
            max=year_max,
            value=[year_min, year_max],
            marks={str(y): str(y) for y in range(year_min, year_max + 1, 2)},
            step=1,
            tooltip={'placement': 'bottom', 'always_visible': False}
        )
    ], style=control_style)

    budget_control = html.Div([
        html.Label('Budget Range ($M)', style=label_style),
        dcc.RangeSlider(
            id='budget-slider',
            min=budget_min,
            max=budget_max,
            value=[budget_min, budget_max],
            marks=None,
            tooltip={'placement': 'bottom', 'always_visible': False}
        )
    ], style=control_style)

    category_control = html.Div([
        html.Label('Category', style=label_style),
        dcc.Dropdown(
            id='category-dropdown',
            options=[{'label': cat, 'value': cat} for cat in sorted(categories)],
            value=[],
            multi=True,
            placeholder='Select categories'
        )
    ], style=control_style)

    controls = html.Div(
        style={'display': 'flex', 'justifyContent': 'space-between', 'flexWrap': 'wrap'},
        children=[year_control, budget_control, category_control]
    )
    # The figures themselves are supplied by the update_visuals callback.
    graphs = html.Div([
        dcc.Graph(id='scatter-domestic-international'),
        dcc.Graph(id='bar-top-movies'),
        dcc.Graph(id='heatmap-corr'),
        dcc.Graph(id='box-gross')
    ], style={'display': 'grid', 'gridTemplateColumns': '1fr 1fr', 'gap': '20px'})

    return html.Div([controls, graphs], style={'padding': '20px'})


def _build_insights_tab():
    """Build the static Insights tab: summary text, top-5 table/chart and feature importances."""
    # .copy() so that adding the 'diff' column below writes to an independent
    # frame rather than to a slice of another one.
    top5_overall = df.nlargest(5, 'worldwide gross')[
        ['film', 'worldwide gross', 'domestic gross ($m)', 'international gross ($m)']
    ].copy()
    avg_dom = df['domestic gross ($m)'].mean()
    avg_intl = df['international gross ($m)'].mean()
    best_films = top5_overall['film'].tolist()
    top5_overall['diff'] = top5_overall['international gross ($m)'] - top5_overall['domestic gross ($m)']

    insights_text = [
        f"Top 5 Marvel movies by worldwide gross: {', '.join(best_films)}.",
        f"Average domestic gross (all movies): ${avg_dom:.2f}M.",
        f"Average international gross (all movies): ${avg_intl:.2f}M."
    ]
    diff_table = top5_overall[
        ['film', 'domestic gross ($m)', 'international gross ($m)', 'diff']
    ].reset_index(drop=True)
    importances_df = pd.DataFrame({
        'Feature': top_features,
        'Importance': [round(i, 4) for i in top_importances]
    })

    table_fig = go.Figure(
        data=[go.Table(
            header=dict(values=list(diff_table.columns), fill_color=marvel_colors['surface'],
                        font_color=marvel_colors['text']),
            cells=dict(values=[diff_table[col] for col in diff_table.columns],
                       fill_color=marvel_colors['surface'], font_color=marvel_colors['text_secondary'])
        )],
        layout=go.Layout(template='plotly_dark', title='Top 5: Domestic vs International ($M)')
    )
    top5_fig = px.bar(
        top5_overall,
        x='film',
        y='worldwide gross',
        title='Top 5 Worldwide Gross',
        template='plotly_dark'
    ).update_layout(xaxis_tickangle=-45)
    importance_fig = px.bar(
        importances_df,
        x='Feature',
        y='Importance',
        title='Feature Importances',
        template='plotly_dark',
        text='Importance'
    ).update_traces(textposition='outside')

    return html.Div([
        html.H2('Analytical Insights'),
        html.Div([html.P(line, style={'color': marvel_colors['text_secondary']}) for line in insights_text]),
        dcc.Graph(figure=table_fig),
        html.H3('Top 5 Movies by Worldwide Gross'),
        dcc.Graph(figure=top5_fig),
        html.H3('Top 10 Feature Importances'),
        dcc.Graph(figure=importance_fig)
    ], style={'padding': '20px'})


@app.callback(Output('tabs-content', 'children'), Input('tabs', 'value'))
def render_content(tab):
    """
    Return the children of the 'tabs-content' container for the selected tab.

    Args:
        tab: Value of the selected tab ('tab-visuals' or 'tab-insights').

    Returns:
        The component tree for that tab, or None for any other value.
    """
    if tab == 'tab-visuals':
        return _build_visuals_tab()
    if tab == 'tab-insights':
        return _build_insights_tab()
    return None
# Callback to update visualizations
@app.callback(
    Output('scatter-domestic-international', 'figure'),
    Output('bar-top-movies', 'figure'),
    Output('heatmap-corr', 'figure'),
    Output('box-gross', 'figure'),
    Input('year-slider', 'value'),
    Input('budget-slider', 'value'),
    Input('category-dropdown', 'value')
)
def update_visuals(year_range, budget_range, selected_categories):
    """
    Rebuild the four Visualizations-tab figures for the current filters.

    Movies are kept when their year and budget fall inside the (inclusive)
    slider ranges and, if any categories are selected, when their grouped
    category is one of them. When nothing matches, the same blank dark
    figure is returned for all four outputs.

    Returns, in order: scatter of domestic vs international gross (with a
    linear trendline when it can be fitted), bar chart of the top 10 movies
    by worldwide gross, correlation heatmap, box plot of worldwide gross by
    category.
    """
    within_sliders = (
        (df['year'] >= year_range[0])
        & (df['year'] <= year_range[1])
        & (df['budget'] >= budget_range[0])
        & (df['budget'] <= budget_range[1])
    )
    subset = df[within_sliders]
    if selected_categories:
        subset = subset[subset['category_grouped'].isin(selected_categories)]

    if subset.empty:
        blank = go.Figure()
        blank.update_layout(template='plotly_dark')
        return blank, blank, blank, blank

    # Scatter: domestic vs international gross
    domestic = subset['domestic gross ($m)']
    international = subset['international gross ($m)']
    scatter_fig = px.scatter(
        subset,
        x='domestic gross ($m)',
        y='international gross ($m)',
        size='budget',
        color='year',
        hover_name='film',
        title='Domestic vs International Gross ($M)',
        template='plotly_dark'
    )
    # A straight-line fit needs at least two distinct x values.
    can_fit = len(subset) > 1 and domestic.nunique() > 1
    if can_fit:
        line_coeffs = np.polyfit(domestic, international, 1)
        fitted = np.poly1d(line_coeffs)(domestic)
        scatter_fig.add_trace(go.Scatter(
            x=domestic,
            y=fitted,
            mode='lines',
            name='Trendline',
            line=dict(color=marvel_colors['primary'])
        ))

    # Bar: ten highest-grossing movies in the filtered set
    bar_fig = px.bar(
        subset.nlargest(10, 'worldwide gross'),
        x='film',
        y='worldwide gross',
        color='year',
        title='Top 10 Movies by Worldwide Gross',
        template='plotly_dark'
    )
    bar_fig.update_layout(xaxis_tickangle=-45)

    # Heatmap: correlations between the financial columns that are present
    wanted = (
        'budget', 'domestic gross ($m)', 'international gross ($m)',
        'worldwide gross', 'opening weekend ($m)', 'second weekend ($m)',
    )
    available = [name for name in wanted if name in subset.columns]
    correlations = subset[available].corr()
    heatmap_fig = go.Figure(
        data=go.Heatmap(
            z=correlations.values,
            x=correlations.columns,
            y=correlations.columns,
            colorscale='Viridis'
        )
    )
    heatmap_fig.update_layout(title='Correlation Heatmap', template='plotly_dark')

    # Box: worldwide gross distribution per category
    box_fig = px.box(
        subset,
        x='category_grouped',
        y='worldwide gross',
        color='category_grouped',
        title='Worldwide Gross by Category',
        template='plotly_dark'
    )
    return scatter_fig, bar_fig, heatmap_fig, box_fig
# Run the Dash server
if __name__ == '__main__':
    # Dash replaced ``run_server`` with ``run`` (newer releases no longer
    # provide ``run_server``), so prefer ``run`` and fall back to the old
    # name only on releases that predate it.
    start_server = getattr(app, 'run', None)
    if start_server is None:
        start_server = app.run_server
    # NOTE(review): debug=True enables the dev tools and hot reload; it is
    # meant for local development only.
    start_server(debug=True)